/* add size, book-to-market, age */
* add to the dataset industry_skew_major;

libname edu '!userprofile\\Dropbox\Education\Replication\Data'; 

DATA crsp;  /*read all the crsp monthly data with nonmissing return*/
set edu.crsp_data;
IF not missing (RET);
if RET ne 'C';
if RET ne 'B';
year=year(date);
month=month(date);
if 1<=EXCHCD<=3 and SHRCD in (10 11);
naics_3=compress(substr(naics,1,3));
sic_2=floor(siccd/100);
mktcap = abs(prc)*shrout;
run;

/*annualize monthly returns*/
proc sql noprint;
create table annualReturns as
select *,
  count(ret) as nbMonths,
  exp(sum(log(1+ret))) as cumulativeReturn
from crsp
group by permno, year;
select * from annualReturns;
quit;

proc sort data = annualreturns; by permno year month; run;

data annualreturns;
set annualreturns;
by permno year month;
if last.year;
if  nbmonths>=6; /* requires 6 observations in a year */
ann_ret=cumulativeReturn-1;
run;

data mktcap (keep = permco permno year month sic_2 naics_3 mktcap); set annualreturns; mktcap = mktcap * 1000; run;

* calculate book equity;
* this definition is the same as Ken French (http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html);
data book (keep = lpermno lpermco datadate book);
  set edu.crspcompustat_data;
  if TXDB = . then TXDB = 0;
  if ITCB = . then ITCB = 0;
  if PSTKRV ~= . then BVPS = PSTKRV;
  else if PSTKL ~= . then BVPS = PSTKL;
  else if PSTK ~= . then BVPS = PSTK;

  if SEQ ~= . then SE = SEQ;
  else if CEQ ~= . and PSTK ~= . then SE = CEQ - PSTK;
  else SE = AT - LT;

  book = (SE + TXDB + ITCB - BVPS)*1000000;
run;

* calculate log book-to-market for each firm;
* merge book data with mktcap;
proc sql;
  create table btm as
  select a.*, b.book
  from mktcap as a left join book as b
  on a.permno = b.lpermno and a.year = year(b.datadate);
quit;

* keep the larger book value if there are duplicates;
proc sort data = btm; by permno year month descending mktcap descending book; run;
data btm; set btm; by permno year month; if first.month; run;

data btm; set btm; logbtm = log(book/mktcap); run;

* get log firm age;
proc sort data = crsp; by permco year month; run;
data crsp_first; set crsp; by permco year; if first.permco; run;

proc sql;
  create table all as
  select a.*, b.year as first
  from btm as a, crsp_first as b
  where a.permco = b.permco;
quit;

data all;
  set all;
  logage = log(year - first);
run; 

* get lagmktcap, emp for each firm;
proc sql;
  create table all2 as
  select a.*, b.mktcap as lagmktcap
  from all as a left join crsp as b
  on a.permno = b.permno and b.month = 12 and a.year - b.year = 1;
quit;

data emp (keep = lpermno datadate year emp); set edu.crspcompustat_data; year = year(datadate); run; 

proc sort data = emp; by lpermno year descending datadate descending emp; run;
proc sort data = emp nodupkey; by lpermno year; run;

proc sql;
  create table all2 as
  select distinct a.*, b.emp
  from all2 as a left join emp as b
  on a.permno = b.lpermno and a.year = b.year;
quit;

/* for each permco, only the permno that has the largest market cap will be counted (get rid of multiple share classes) */
proc sort data = all2;
  by permco year descending mktcap;
run;

proc means data = all2 noprint;
  by permco year;
  var lagmktcap;
  output out = size sum = lagmktcap;
run;

data all2 (drop = lagmktcap);
  set all2;
  by permco year;
  if first.year;
run;

proc sql;
  create table all2 as
  select a.*, b.lagmktcap
  from all2 as a, size as b
  where a.permco = b.permco and a.year = b.year;
quit;

data all2;
  set all2;
  logmktcap = log(mktcap);
run;

/* map SIC2 and NAICS3 into major code (use SIC2 before 2005; NAICS3 afterwards) */
proc sql;
  create table temp as
  select a.*, b.major 
  from all as a left join edu.sic_2_major as b
  on a.sic_2 = b.sic_2;
quit;

data temp;
  set temp;
  if year >= 2005 then delete;
run;

data temp2; set all; naics3 = input(naics_3, 12.); run;

proc sql;
  create table temp2 as
  select a.*, b.major 
  from temp2 as a left join edu.naics_3_major as b
  on a.naics3 = b.naics_3;
quit;

data temp2 (drop = naics3);
  set temp2;
  if year >= 2005;
run;

data all_major; set temp; run;
proc append base = all_major data = temp2; run;

proc sql;
  create table temp as
  select a.*, b.major 
  from all2 as a left join edu.sic_2_major as b
  on a.sic_2 = b.sic_2;
quit;

data temp;
  set temp;
  if year >= 2005 then delete;
run;

data temp2; set all2; naics3 = input(naics_3, 12.); run;

proc sql;
  create table temp2 as
  select a.*, b.major 
  from temp2 as a left join edu.naics_3_major as b
  on a.naics3 = b.naics_3;
quit;

data temp2 (drop = naics3);
  set temp2;
  if year >= 2005;
run;

data all_major2; set temp; run;
proc append base = all_major2 data = temp2; run;

* log book-to-market and log age;
proc sort data = all_major2; by year major; run;
proc means data = all_major2 noprint;
  by year major;
  var logmktcap logbtm logage;
  weight emp;
  output out = btm_agee_major mean = log_mktcape log_btme log_agee;
run;

/* add to permanent datasets */
proc sql;
  create table edu.industry_skew_major as
  select a.*, b.log_mktcape, b.log_agee, b.log_btme
  from edu.industry_skew_major as a left join btm_agee_major as b
  on a.major = b.major and a.year = b.year;
quit;
